In [1]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
from PIL import  Image
import pandas as pd
import seaborn as sns
import itertools
import warnings
warnings.filterwarnings("ignore")
import io
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
import plotly
%matplotlib inline
In [2]:
import plotly.io as pio
pio.renderers.default='notebook'
In [3]:
data = pd.read_csv('C:/Users/HughOOZ/Downloads/WA_Fn-UseC_-HR-Employee-Attrition.csv')
data.head()
Out[3]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 Yes Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 ... 1 80 0 8 0 1 6 4 0 5
1 49 No Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 ... 4 80 1 10 3 3 10 7 1 7
2 37 Yes Travel_Rarely 1373 Research & Development 2 2 Other 1 4 ... 2 80 0 7 3 3 0 0 0 0
3 33 No Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 ... 3 80 0 8 3 3 8 7 3 0
4 27 No Travel_Rarely 591 Research & Development 2 1 Medical 1 7 ... 4 80 1 6 3 3 2 2 2 2

5 rows × 35 columns

In [4]:
print('row:',data.shape[0])
print('columns:',data.shape[1])
print('\nfeatures:',data.columns.tolist())
print('\nmissing values:',data.isnull().sum().values.sum())
print('\nunique values:','\n',data.nunique())
row: 1470
columns: 35

features: ['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

missing values: 0

unique values: 
 Age                           43
Attrition                      2
BusinessTravel                 3
DailyRate                    886
Department                     3
DistanceFromHome              29
Education                      5
EducationField                 6
EmployeeCount                  1
EmployeeNumber              1470
EnvironmentSatisfaction        4
Gender                         2
HourlyRate                    71
JobInvolvement                 4
JobLevel                       5
JobRole                        9
JobSatisfaction                4
MaritalStatus                  3
MonthlyIncome               1349
MonthlyRate                 1427
NumCompaniesWorked            10
Over18                         1
OverTime                       2
PercentSalaryHike             15
PerformanceRating              2
RelationshipSatisfaction       4
StandardHours                  1
StockOptionLevel               4
TotalWorkingYears             40
TrainingTimesLastYear          7
WorkLifeBalance                4
YearsAtCompany                37
YearsInCurrentRole            19
YearsSinceLastPromotion       16
YearsWithCurrManager          18
dtype: int64
In [5]:
def age(data) :
    if data["Age"] <= 25 :
        return "Age_0-25"
    elif (data["Age"] > 25) and (data["Age"] <= 35 ):
        return "Age_25-35"
    elif (data["Age"] > 35) and (data["Age"] <= 45) :
        return "Age_35-45"
    elif (data["Age"] > 45) and (data["Age"] <= 55) :
        return "Age_45-55"
    elif data["Age"] > 55 :
        return "Age_55-60"

data["age_group"] = data.apply(lambda x:age(x), axis=1)

data = data.drop(['StandardHours','Over18','EmployeeCount'], axis=1)

attrition = data[data['Attrition'] == 'Yes']
not_attrition = data[data['Attrition'] == 'No']
Id_col = ['EmployeeNumber']
label_col = ['Attrition']
cat_cols = data.nunique()[data.nunique() < 10].keys().tolist()
cat_cols = [x for x in cat_cols if x not in label_col]
num_cols = [x for x in data.columns if x not in cat_cols + label_col + Id_col]
print(cat_cols,'\n',num_cols)
['BusinessTravel', 'Department', 'Education', 'EducationField', 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'OverTime', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TrainingTimesLastYear', 'WorkLifeBalance', 'age_group'] 
 ['Age', 'DailyRate', 'DistanceFromHome', 'HourlyRate', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
In [6]:
lab = data['Attrition'].value_counts().keys().tolist()
val = data['Attrition'].value_counts().values.tolist()
trace = go.Pie(labels = lab ,
               values = val ,
               marker = dict(colors =  [ 'royalblue' ,'lime'],
                             line = dict(color = "white",
                                         width =  1.3)
                            ),
               rotation = 90,
               hoverinfo = "label+value+text",
               hole = .5
              )
layout = go.Layout(dict(title = "Employee Attrition in data",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                       )
                  )
fig = go.Figure(data = [trace],layout = layout)
py.iplot(fig)
plt.show()
In [7]:
def plot_pie(column) :
    
    trace1 = go.Pie(values  = attrition[column].value_counts().values.tolist(),
                    labels  = not_attrition[column].value_counts().keys().tolist(),
                    hoverinfo = "label+percent+name",
                    domain  = dict(x = [0,.48]),
                    name    = "attrition",
                    marker  = dict(line = dict(width = 2,
                                               color = "rgb(243,243,243)")
                                  ),
                    hole    = .6
                   )
    trace2 = go.Pie(values  = not_attrition[column].value_counts().values.tolist(),
                    labels  = not_attrition[column].value_counts().keys().tolist(),
                    hoverinfo = "label+percent+name",
                    marker  = dict(line = dict(width = 2,
                                               color = "rgb(243,243,243)")
                                  ),
                    domain  = dict(x = [.52,1]),
                    hole    = .6,
                    name    = "not attrition" 
                   )


    layout = go.Layout(dict(title = column + " distribution in attrition ",
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            annotations = [dict(text = "attrition",
                                                font = dict(size = 13),
                                                showarrow = False,
                                                x = .15, y = .5),
                                           dict(text = "not attrition",
                                                font = dict(size = 13),
                                                showarrow = False,
                                                x = .88,y = .5
                                               )
                                          ]
                           )
                      )
    data = [trace1,trace2]
    fig  = go.Figure(data = data,layout = layout)
    py.iplot(fig)


def histogram(column) :
    trace1 = go.Histogram(x  = attrition[column],
                          histnorm= "percent",
                          name = "attrition",
                          marker = dict(line = dict(width = .5,
                                                    color = "black"
                                                    )
                                        ),
                         opacity = .9 
                         ) 
    
    trace2 = go.Histogram(x  = not_attrition[column],
                          histnorm = "percent",
                          name = "Not attrition",
                          marker = dict(line = dict(width = .5,
                                              color = "black"
                                             )
                                 ),
                          opacity = .9
                         )
    
    data = [trace1,trace2]
    layout = go.Layout(dict(title =column + " distribution in attrition ",
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                             title = column,
                                             zerolinewidth=1,
                                             ticklen=5,
                                             gridwidth=2
                                            ),
                            yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                             title = "percent",
                                             zerolinewidth=1,
                                             ticklen=5,
                                             gridwidth=2
                                            ),
                           )
                      )
    fig  = go.Figure(data=data,layout=layout)
    
    py.iplot(fig)
    

for i in cat_cols :
    plot_pie(i)

for i in num_cols :
    histogram(i)
In [8]:
def scatter_matrix(df)  :
    
    df  = df.sort_values(by = "Attrition" ,ascending = True)
    classes = df["Attrition"].unique().tolist()
    classes
    
    class_code  = {classes[k] : k for k in range(2)}
    class_code

    color_vals = [class_code[cl] for cl in df["Attrition"]]
    color_vals

    pl_colorscale = "Portland"

    pl_colorscale

    text = [df.loc[k,"Attrition"] for k in range(len(df))]
    text

    trace = go.Splom(dimensions = [dict(label = x,values = df[x]) for x in ['MonthlyIncome','MonthlyRate','DistanceFromHome']],
                     text = text,
                     marker = dict(color = color_vals,
                                   colorscale = pl_colorscale,
                                   size = 3,
                                   showscale = False,
                                   line = dict(width = .1,
                                               color='rgb(230,230,230)'
                                              )
                                  )
                    )
    axis = dict(showline  = True,
                zeroline  = False,
                gridcolor = "#fff",
                ticklen   = 4
               )
    
    layout = go.Layout(dict(title  = 
                            "Scatter plot matrix for Numerical columns for attrition",
                            autosize = False,
                            height = 800,
                            width  = 800,
                            dragmode = "select",
                            hovermode = "closest",
                            plot_bgcolor  = 'rgba(240,240,240, 0.95)',
                            xaxis1 = dict(axis),
                            yaxis1 = dict(axis),
                            xaxis2 = dict(axis),
                            yaxis2 = dict(axis),
                            xaxis3 = dict(axis),
                            yaxis3 = dict(axis)
                           )
                      )
    plt_data   = [trace]
    fig = go.Figure(data = plt_data,layout = layout )
    py.iplot(fig)
    
scatter_matrix(data)
In [9]:
age_at = attrition['age_group'].value_counts().reset_index()
age_at.columns  = ["age_group","count"]
age_nat = not_attrition["age_group"].value_counts().reset_index()
age_nat.columns = ["age_group","count"]

trace1 = go.Bar(x = age_at["age_group"]  , y = age_at["count"],
                name = "Age_Attrition",
                marker = dict(line = dict(width = .5,color = "black")),
                opacity = .9)

trace2 = go.Bar(x = age_nat["age_group"] , y = age_nat["count"],
                name = "Age_NotAttrition",
                marker = dict(line = dict(width = .5,color = "black")),
                opacity = .9)

layout = go.Layout(dict(title = " attrition in age groups",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "age group",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "count",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                       )
                  )
plt_data = [trace1,trace2]
fig  = go.Figure(data=plt_data,layout=layout)
py.iplot(fig)
In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

Id_col = ['EmployeeNumber']
target_col = ["Attrition"]

cat_cols = data.nunique()[data.nunique() < 10].keys().tolist()
cat_cols = [x for x in cat_cols if x not in target_col]

num_cols = [x for x in data.columns if x not in cat_cols + target_col + Id_col]

bin_cols = data.nunique()[data.nunique() == 2].keys().tolist()

multi_cols = [i for i in cat_cols if i not in bin_cols]

le = LabelEncoder()
for i in bin_cols :
    data[i] = le.fit_transform(data[i])
    
data = pd.get_dummies(data = data,columns = multi_cols )

std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
scaled = pd.DataFrame(scaled,columns=num_cols)

df_data_og = data.copy()
data = data.drop(columns = num_cols,axis = 1)
data = data.merge(scaled,left_index=True,right_index=True,how = "left")
In [11]:
summary = (df_data_og[[i for i in df_data_og.columns if i not in Id_col]].
           describe().transpose().reset_index())

summary = summary.rename(columns = {"index" : "feature"})
summary = np.around(summary,3)

val_lst = [summary['feature'], summary['count'],
           summary['mean'],summary['std'],
           summary['min'], summary['25%'],
           summary['50%'], summary['75%'], summary['max']]

trace  = go.Table(header = dict(values = summary.columns.tolist(),
                                line = dict(color = ['#506784']),
                                fill = dict(color = ['#119DFF']),
                               ),
                  cells  = dict(values = val_lst,
                                line = dict(color = ['#506784']),
                                fill = dict(color = ["lightgrey",'#F5F8FF'])
                               ),
                  columnwidth = [200,60,100,100,60,60,80,80,80])
layout = go.Layout(dict(title = "Variable Summary"))
figure = go.Figure(data=[trace],layout=layout)
py.iplot(figure)
In [12]:
correlation = data.corr()

matrix_cols = correlation.columns.tolist()

corr_array  = np.array(correlation)

trace = go.Heatmap(z = corr_array,
                   x = matrix_cols,
                   y = matrix_cols,
                   colorscale = "Viridis",
                   colorbar   = dict(title = "Pearson Correlation coefficient",
                                     titleside = "right"
                                    ) ,
                  )

layout = go.Layout(dict(title = "Correlation Matrix for variables",
                        autosize = False,
                        height  = 720,
                        width   = 800,
                        margin  = dict(r = 0 ,l = 210,
                                       t = 25,b = 210,
                                      ),
                        yaxis   = dict(tickfont = dict(size = 9)),
                        xaxis   = dict(tickfont = dict(size = 9))
                       )
                  )

plt_data = [trace]
fig = go.Figure(data=plt_data,layout=layout)
py.iplot(fig)
In [13]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 2)

X = data[[i for i in data.columns if i not in Id_col + target_col]]
Y = data[target_col + Id_col]
principal_components = pca.fit_transform(X)
pca_data = pd.DataFrame(principal_components,columns = ["PC1","PC2"])
pca_data = pca_data.merge(Y,left_index=True,right_index=True,how="left")
pca_data["Attrition"] = pca_data["Attrition"].replace({1:"Attrition",0:"Not Attrition"})

pca_data
Out[13]:
PC1 PC2 Attrition EmployeeNumber
0 -0.169603 1.534182 Attrition 1
1 0.928866 -0.336469 Not Attrition 2
2 -2.262723 1.543330 Attrition 4
3 -0.772330 -0.747641 Not Attrition 5
4 -1.673791 0.850173 Not Attrition 7
... ... ... ... ...
1465 -0.634666 0.634130 Not Attrition 2061
1466 0.889056 0.172504 Not Attrition 2062
1467 -1.304994 -0.784013 Not Attrition 2064
1468 1.228922 0.321258 Not Attrition 2065
1469 -1.304743 -0.285835 Not Attrition 2068

1470 rows × 4 columns

In [14]:
def pca_scatter(target,color) :
    tracer = go.Scatter(x = pca_data[pca_data["Attrition"] == target]["PC1"] ,
                        y = pca_data[pca_data["Attrition"] == target]["PC2"],
                        name = target,mode = "markers",
                        marker = dict(color = color,
                                      line = dict(width = .5),
                                      symbol =  "diamond-open"),
                        text = ("EmployeeNumber : " + 
                                str(pca_data[pca_data["Attrition"] == target]['EmployeeNumber']))
                       )
    return tracer

layout = go.Layout(dict(title = "Visualising data with principal components",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "principal component 1",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "principal component 2",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        height = 600
                       )
                  )

trace1 = pca_scatter('Attrition','red')
trace2 = pca_scatter('Not Attrition','royalblue')
plt_data = [trace2,trace1]
fig = go.Figure(data=plt_data,layout=layout)
py.iplot(fig)
In [15]:
bi_cs = data.nunique()[data.nunique() == 2].keys()
dat_rad = data[bi_cs]

def plot_radar(df,aggregate,title) :
    data_frame = df[df["Attrition"] == aggregate] 
    data_frame_x = data_frame[bi_cs].sum().reset_index()
    data_frame_x.columns  = ["feature","yes"]
    data_frame_x["no"]    = data_frame.shape[0]  - data_frame_x["yes"]
    data_frame_x  = data_frame_x[data_frame_x["feature"] != "Attrition"]
    
    #count of 1's(yes)
    trace1 = go.Scatterpolar(r = data_frame_x["yes"].values.tolist(),
                             theta = data_frame_x["feature"].tolist(),
                             fill  = "toself",name = "count of 1's",
                             mode = "markers+lines",
                             marker = dict(size = 5)
                            )
    #count of 0's(No)
    trace2 = go.Scatterpolar(r = data_frame_x["no"].values.tolist(),
                             theta = data_frame_x["feature"].tolist(),
                             fill  = "toself",name = "count of 0's",
                             mode = "markers+lines",
                             marker = dict(size = 5)
                            ) 
    layout = go.Layout(dict(polar = dict(radialaxis = dict(visible = True,
                                                           side = "counterclockwise",
                                                           showline = True,
                                                           linewidth = 2,
                                                           tickwidth = 2,
                                                           gridcolor = "white",
                                                           gridwidth = 2),
                                         angularaxis = dict(tickfont = dict(size = 10),
                                                            layer = "below traces"
                                                           ),
                                         bgcolor  = "rgb(243,243,243)",
                                        ),
                            paper_bgcolor = "rgb(243,243,243)",
                            title = title,height = 700))
    
    plt_data = [trace2,trace1]
    fig = go.Figure(data=plt_data,layout=layout)
    py.iplot(fig)

#plot
plot_radar(dat_rad,1,"Attrition")
plot_radar(dat_rad,0,"Not Attrition")
In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,scorer
from sklearn.metrics import f1_score
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score
from yellowbrick.classifier import DiscriminationThreshold

train,test = train_test_split(data,test_size = .25 ,random_state = None)

cols    = [i for i in data.columns if i not in Id_col + target_col]
train_X = train[cols]
train_Y = train[target_col]
test_X  = test[cols]
test_Y  = test[target_col]

#Function attributes
#dataframe     - processed dataframe
#Algorithm     - Algorithm used 
#training_x    - predictor variables dataframe(training)
#testing_x     - predictor variables dataframe(testing)
#training_y    - target variable(training)
#training_y    - target variable(testing)
#cf - ["coefficients","features"](cooefficients for logistic 
                                 #regression,features for tree based models)

#threshold_plot - if True returns threshold plot for model
    
def data_attrition_prediction(algorithm,training_x,testing_x,
                             training_y,testing_y,cols,cf,threshold_plot) :
    
    #model
    algorithm.fit(training_x,training_y)
    predictions   = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)
    #coeffs
    if   cf == "coefficients" :
        coefficients  = pd.DataFrame(algorithm.coef_.ravel())
    elif cf == "features" :
        coefficients  = pd.DataFrame(algorithm.feature_importances_)
        
    column_df     = pd.DataFrame(cols)
    coef_sumry    = (pd.merge(coefficients,column_df,left_index= True,
                              right_index= True, how = "left"))
    coef_sumry.columns = ["coefficients","features"]
    coef_sumry    = coef_sumry.sort_values(by = "coefficients",ascending = False)
    
    print (algorithm)
    print ("\n Classification report : \n",classification_report(testing_y,predictions))
    print ("Accuracy   Score : ",accuracy_score(testing_y,predictions))
    #confusion matrix
    conf_matrix = confusion_matrix(testing_y,predictions)
    #roc_auc_score
    model_roc_auc = roc_auc_score(testing_y,predictions) 
    print ("Area under curve : ",model_roc_auc,"\n")
    fpr,tpr,thresholds = roc_curve(testing_y,probabilities[:,1])
    
    trace1 = go.Heatmap(z = conf_matrix ,
                        x = ["Not Attrition","Attrition"],
                        y = ["Not Attrition","Attrition"],
                        showscale  = False,colorscale = "Picnic",
                        name = "matrix")
    
    #plot roc curve
    trace2 = go.Scatter(x = fpr,y = tpr,
                        name = "Roc : " + str(model_roc_auc),
                        line = dict(color = ('rgb(22, 96, 167)'),width = 2))
    trace3 = go.Scatter(x = [0,1],y=[0,1],
                        line = dict(color = ('rgb(205, 12, 24)'),width = 2,
                        dash = 'dot'))
    
    #plot coeffs
    trace4 = go.Bar(x = coef_sumry["features"],y = coef_sumry["coefficients"],
                    name = "coefficients",
                    marker = dict(color = coef_sumry["coefficients"],
                                  colorscale = "Picnic",
                                  line = dict(width = .6,color = "black")))
    
    #subplots
    fig = plotly.subplots.make_subplots(rows=2, cols=2, specs=[[{}, {}], [{'colspan': 2}, None]],
                            subplot_titles=('Confusion Matrix',
                                            'Receiver operating characteristic',
                                            'Feature Importances'))
    
    fig.append_trace(trace1,1,1)
    fig.append_trace(trace2,1,2)
    fig.append_trace(trace3,1,2)
    fig.append_trace(trace4,2,1)
    
    fig['layout'].update(showlegend=False, title="Model performance" ,
                         autosize = False,height = 900,width = 800,
                         plot_bgcolor = 'rgba(240,240,240, 0.95)',
                         paper_bgcolor = 'rgba(240,240,240, 0.95)',
                         margin = dict(b = 195))
    fig["layout"]["xaxis2"].update(dict(title = "false positive rate"))
    fig["layout"]["yaxis2"].update(dict(title = "true positive rate"))
    fig["layout"]["xaxis3"].update(dict(showgrid = True,tickfont = dict(size = 10),
                                        tickangle = 90))
    py.iplot(fig)
    
    if threshold_plot == True : 
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x,training_y)
        visualizer.poof()
        
logit  = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

data_attrition_prediction(logit,train_X,test_X,train_Y,test_Y,
                         cols,"coefficients",threshold_plot = True)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
                   solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

 Classification report : 
               precision    recall  f1-score   support

           0       0.87      0.97      0.92       303
           1       0.69      0.34      0.45        65

    accuracy                           0.86       368
   macro avg       0.78      0.65      0.69       368
weighted avg       0.84      0.86      0.84       368

Accuracy   Score :  0.8559782608695652
Area under curve :  0.6527291190657529 

In [17]:
from imblearn.over_sampling import SMOTE

cols    = [i for i in data.columns if i not in Id_col+target_col]

smote_X = data[cols]
smote_Y = data[target_col]

#Split train and test data
smote_train_X,smote_test_X,smote_train_Y,smote_test_Y = train_test_split(smote_X,smote_Y,
                                                                         test_size = .25 ,
                                                                         random_state = 111)

#oversampling minority class using smote
os = SMOTE(random_state = 0)
os_smote_X,os_smote_Y = os.fit_sample(smote_train_X,smote_train_Y)
os_smote_X = pd.DataFrame(data = os_smote_X,columns=cols)
os_smote_Y = pd.DataFrame(data = os_smote_Y,columns=target_col)
###



logit_smote = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

data_attrition_prediction(logit_smote,os_smote_X,test_X,os_smote_Y,test_Y,
                         cols,"coefficients",threshold_plot = True)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
                   solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

 Classification report : 
               precision    recall  f1-score   support

           0       0.88      0.96      0.92       303
           1       0.68      0.40      0.50        65

    accuracy                           0.86       368
   macro avg       0.78      0.68      0.71       368
weighted avg       0.85      0.86      0.85       368

Accuracy   Score :  0.8614130434782609
Area under curve :  0.6801980198019801 

In [18]:
from sklearn.feature_selection import RFE

logit = LogisticRegression()

rfe = RFE(logit,10)
rfe = rfe.fit(os_smote_X,os_smote_Y.values.ravel())

rfe.support_
rfe.ranking_

#identified columns Recursive Feature Elimination
idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                       "columns" : [i for i in data.columns if i not in Id_col + target_col],
                       "ranking" : rfe.ranking_,
                      })
cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()


#separating train and test data
train_rf_X = os_smote_X[cols]
train_rf_Y = os_smote_Y
test_rf_X  = test[cols]
test_rf_Y  = test[target_col]

logit_rfe = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
#applying model
data_attrition_prediction(logit_rfe,train_rf_X,test_rf_X,train_rf_Y,test_rf_Y,
                         cols,"coefficients",threshold_plot = True)

tab_rk = ff.create_table(idc_rfe)
py.iplot(tab_rk)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
                   solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

 Classification report : 
               precision    recall  f1-score   support

           0       0.82      1.00      0.90       303
           1       0.00      0.00      0.00        65

    accuracy                           0.82       368
   macro avg       0.41      0.50      0.45       368
weighted avg       0.68      0.82      0.74       368

Accuracy   Score :  0.8233695652173914
Area under curve :  0.5 

In [19]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

#select columns
cols = [i for i in data.columns if i not in Id_col + target_col ]

#dataframe with non negative values
df_x = df_data_og[cols]
df_y = df_data_og[target_col]

#fit model with k= 3
select = SelectKBest(score_func = chi2,k = 3)
fit    = select.fit(df_x,df_y)

#Summerize scores
print ("scores")
print (fit.scores_)
print ("P - Values")
print (fit.pvalues_)

#create dataframe
score = pd.DataFrame({"features":cols,"scores":fit.scores_,"p_values":fit.pvalues_ })
score = score.sort_values(by = "scores" ,ascending =False)


#createing new label for categorical and numerical columns
score["feature_type"] = np.where(score["features"].isin(num_cols),"Numerical","Categorical")

#plot
trace  = go.Scatter(x = score[score["feature_type"] == "Categorical"]["features"],
                    y = score[score["feature_type"] == "Categorical"]["scores"],
                    name = "Categorial",mode = "lines+markers",
                    marker = dict(color = "red",
                                  line = dict(width =1))
                   )

trace1 = go.Bar(x = score[score["feature_type"] == "Numerical"]["features"],
                y = score[score["feature_type"] == "Numerical"]["scores"],name = "Numerical",
                marker = dict(color = "royalblue",
                              line = dict(width =1)),
                xaxis = "x2",yaxis = "y2"
               )
layout = go.Layout(dict(title = "Scores for Categorical & Numerical features",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     tickfont = dict(size =10),
                                     domain=[0, 0.7],
                                     tickangle = 90,zerolinewidth=1,
                                     ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "scores",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        margin = dict(b=200),
                        xaxis2=dict(domain=[0.8, 1],tickangle = 90,
                                    gridcolor = 'rgb(255, 255, 255)'),
                        yaxis2=dict(anchor='x2',gridcolor = 'rgb(255, 255, 255)')
                        )
                  )

plt_data=[trace,trace1]
fig = go.Figure(data=plt_data,layout=layout)
py.iplot(fig)
scores
[5.10086544e-01 6.38450667e+01 1.03810387e-02 7.31793403e+00
 1.58166228e+01 1.04785690e+00 3.98626382e-01 3.70291570e+00
 6.69446524e+00 5.61186870e-01 5.63029133e-02 5.94202281e-01
 7.06703000e-01 1.15556634e+00 1.91887761e+00 9.24044433e-01
 4.07915378e+00 2.22213257e+00 4.44605699e-01 6.43586003e+00
 1.78901236e+01 2.75750453e-01 1.98767614e+00 2.35033126e+00
 1.90389458e+01 2.19099038e+00 1.90226410e+00 5.35982099e+00
 4.18799770e+01 1.60966234e+01 3.35925785e-01 1.01965986e+01
 4.01988831e+00 8.29251836e+00 1.85975262e+00 1.16994947e+01
 9.49613598e+00 9.12658865e+00 1.09780098e+01 1.52305445e-04
 4.47333341e-01 3.42902680e+01 9.63613640e+00 1.94031230e-02
 5.05810694e-02 7.79895642e+00 8.79442184e+00 6.59758576e+00
 3.07716690e+01 4.18770412e+00 3.61936397e-01 1.45192354e-01
 5.46234984e-01 3.20150954e+01 1.99408964e+01 8.49620612e+00
 1.46103161e-01 5.42457934e+00 6.23607176e-01 1.30104406e+00
 1.55501237e+00 2.28824367e+00 1.67106543e+00 2.28289943e+00
 1.35378746e+01 1.38552193e-01 2.38568462e+00 2.62985690e-01
 3.51195897e+01 4.08560082e+00 1.66413568e+01 3.56405252e+00
 2.80785011e-02 8.41552768e+01 9.56580494e+02 6.37721416e+01
 4.31779090e-01 1.27922294e+05 1.19663355e+03 6.43865444e+00
 2.35026858e-01 2.30721618e+02 1.42100054e+02 1.17522596e+02
 7.60172347e+00 1.10671534e+02]
P - Values
[4.75101394e-001 1.34598985e-015 9.18846065e-001 6.82698483e-003
 6.97867071e-005 3.06001178e-001 5.27799504e-001 5.43174721e-002
 9.67126726e-003 4.53782430e-001 8.12437754e-001 4.40798708e-001
 4.00540514e-001 2.82386945e-001 1.65980445e-001 3.36415272e-001
 4.34153967e-002 1.36045027e-001 5.04907817e-001 1.11838995e-002
 2.34032489e-005 5.99500145e-001 1.58584072e-001 1.25256388e-001
 1.28077549e-005 1.38820030e-001 1.67825114e-001 2.06058887e-002
 9.70513958e-011 6.01912603e-005 5.62190430e-001 1.40699939e-003
 4.49666913e-002 3.98087437e-003 1.72653355e-001 6.25170742e-004
 2.05905062e-003 2.51921027e-003 9.21993914e-004 9.90153390e-001
 5.03604036e-001 4.74749891e-009 1.90786128e-003 8.89216967e-001
 8.22055237e-001 5.22764179e-003 3.02152987e-003 1.02117140e-002
 2.90244568e-008 4.07182051e-002 5.47432772e-001 7.03172718e-001
 4.59860171e-001 1.52979201e-008 7.98733582e-006 3.55887780e-003
 7.02287484e-001 1.98552181e-002 4.29709987e-001 2.54022602e-001
 2.12397003e-001 1.30357360e-001 1.96115889e-001 1.30807133e-001
 2.33797012e-004 7.09724591e-001 1.22451237e-001 6.08076326e-001
 3.10065840e-009 4.32500723e-002 4.51556003e-005 5.90434269e-002
 8.66924400e-001 4.57401507e-020 4.92392441e-210 1.39674848e-015
 5.11117265e-001 0.00000000e+000 3.28793276e-262 1.11663196e-002
 6.27821283e-001 4.14926182e-052 9.24706566e-033 2.20572828e-027
 5.83125339e-003 6.98336293e-026]
In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn import tree
from graphviz import Source
from IPython.display import SVG,display

#top 3 categorical features
features_cat  = score[score["feature_type"] == "Categorical"]["features"][:3].tolist()

#top 3 numerical features
features_num  = score[score["feature_type"] == "Numerical"]["features"][:3].tolist()


#Function attributes
#columns        - selected columns
#maximum_depth  - depth of tree
#criterion_type - ["gini" or "entropy"]
#split_type     - ["best" or "random"]
#Model Performance - True (gives model output)

def plot_decision_tree(columns,maximum_depth,criterion_type,
                       split_type,model_performance = None) :
    
    #separating dependent and in dependent variables
    dtc_x = df_x[columns]
    dtc_y = df_y[target_col]
    
    #model
    dt_classifier = DecisionTreeClassifier(max_depth = maximum_depth,
                                           splitter  = split_type,
                                           criterion = criterion_type,
                                          )
    dt_classifier.fit(dtc_x,dtc_y)
    
    #plot decision tree
    graph = Source(tree.export_graphviz(dt_classifier,out_file=None,
                                        rounded=True,proportion = False,
                                        feature_names = columns, 
                                        precision  = 2,
                                        class_names=["Not Attrition","Attrition"],
                                        filled = True                         
                                       )
                  )
    
    #model performance
    if model_performance == True :
        data_attrition_prediction(dt_classifier,
                                 dtc_x,test_X[columns],
                                 dtc_y,test_Y,
                                 columns,"features",threshold_plot = True)
    display(graph)
    
plot_decision_tree(features_num,3,"gini","best")
Tree 0 MonthlyIncome <= 2802.0 gini = 0.27 samples = 1470 value = [1233, 237] class = Not Attrition 1 MonthlyIncome <= 1927.5 gini = 0.43 samples = 335 value = [232, 103] class = Not Attrition 0->1 True 8 MonthlyIncome <= 10661.5 gini = 0.21 samples = 1135 value = [1001, 134] class = Not Attrition 0->8 False 2 MonthlyRate <= 24118.0 gini = 0.49 samples = 32 value = [14, 18] class = Attrition 1->2 5 MonthlyRate <= 3890.5 gini = 0.4 samples = 303 value = [218, 85] class = Not Attrition 1->5 3 gini = 0.5 samples = 27 value = [14, 13] class = Not Attrition 2->3 4 gini = 0.0 samples = 5 value = [0, 5] class = Attrition 2->4 6 gini = 0.49 samples = 21 value = [9, 12] class = Attrition 5->6 7 gini = 0.38 samples = 282 value = [209, 73] class = Not Attrition 5->7 9 MonthlyIncome <= 9790.0 gini = 0.23 samples = 896 value = [775, 121] class = Not Attrition 8->9 12 MonthlyRate <= 2582.0 gini = 0.1 samples = 239 value = [226, 13] class = Not Attrition 8->12 10 gini = 0.22 samples = 837 value = [732, 105] class = Not Attrition 9->10 11 gini = 0.4 samples = 59 value = [43, 16] class = Not Attrition 9->11 13 gini = 0.48 samples = 5 value = [3, 2] class = Not Attrition 12->13 14 gini = 0.09 samples = 234 value = [223, 11] class = Not Attrition 12->14
In [21]:
plot_decision_tree(features_cat,3,"entropy","best",
                   model_performance = True ,)
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

 Classification report : 
               precision    recall  f1-score   support

           0       0.84      1.00      0.91       303
           1       1.00      0.09      0.17        65

    accuracy                           0.84       368
   macro avg       0.92      0.55      0.54       368
weighted avg       0.87      0.84      0.78       368

Accuracy   Score :  0.8396739130434783
Area under curve :  0.5461538461538462 

Tree 0 OverTime <= 0.5 entropy = 0.64 samples = 1470 value = [1233, 237] class = Not Attrition 1 JobLevel_1 <= 0.5 entropy = 0.48 samples = 1054 value = [944, 110] class = Not Attrition 0->1 True 8 JobLevel_1 <= 0.5 entropy = 0.89 samples = 416 value = [289, 127] class = Not Attrition 0->8 False 2 age_group_Age_0-25 <= 0.5 entropy = 0.38 samples = 667 value = [618, 49] class = Not Attrition 1->2 5 age_group_Age_0-25 <= 0.5 entropy = 0.63 samples = 387 value = [326, 61] class = Not Attrition 1->5 3 entropy = 0.38 samples = 651 value = [603, 48] class = Not Attrition 2->3 4 entropy = 0.34 samples = 16 value = [15, 1] class = Not Attrition 2->4 6 entropy = 0.57 samples = 319 value = [276, 43] class = Not Attrition 5->6 7 entropy = 0.83 samples = 68 value = [50, 18] class = Not Attrition 5->7 9 age_group_Age_0-25 <= 0.5 entropy = 0.66 samples = 260 value = [215, 45] class = Not Attrition 8->9 12 age_group_Age_0-25 <= 0.5 entropy = 1.0 samples = 156 value = [74, 82] class = Attrition 8->12 10 entropy = 0.66 samples = 253 value = [210, 43] class = Not Attrition 9->10 11 entropy = 0.86 samples = 7 value = [5, 2] class = Not Attrition 9->11 13 entropy = 1.0 samples = 124 value = [65, 59] class = Not Attrition 12->13 14 entropy = 0.86 samples = 32 value = [9, 23] class = Attrition 12->14
In [22]:
def data_Attrition_prediction_alg(algorithm,training_x,testing_x,
                                 training_y,testing_y,threshold_plot = True) :
    
    #model
    algorithm.fit(training_x,training_y)
    predictions   = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)
    
    print (algorithm)
    print ("\n Classification report : \n",classification_report(testing_y,predictions))
    print ("Accuracy Score   : ",accuracy_score(testing_y,predictions))
    #confusion matrix
    conf_matrix = confusion_matrix(testing_y,predictions)
    #roc_auc_score
    model_roc_auc = roc_auc_score(testing_y,predictions) 
    print ("Area under curve : ",model_roc_auc)
    fpr,tpr,thresholds = roc_curve(testing_y,probabilities[:,1])
     
    #plot roc curve
    trace1 = go.Scatter(x = fpr,y = tpr,
                        name = "Roc : " + str(model_roc_auc),
                        line = dict(color = ('rgb(22, 96, 167)'),width = 2),
                       )
    trace2 = go.Scatter(x = [0,1],y=[0,1],
                        line = dict(color = ('rgb(205, 12, 24)'),width = 2,
                        dash = 'dot'))
    
    #plot confusion matrix
    trace3 = go.Heatmap(z = conf_matrix ,x = ["Not churn","Churn"],
                        y = ["Not churn","Churn"],
                        showscale  = False,colorscale = "Blues",name = "matrix",
                        xaxis = "x2",yaxis = "y2"
                       )
    
    layout = go.Layout(dict(title="Model performance" ,
                            autosize = False,height = 500,width = 800,
                            showlegend = False,
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            xaxis = dict(title = "false positive rate",
                                         gridcolor = 'rgb(255, 255, 255)',
                                         domain=[0, 0.6],
                                         ticklen=5,gridwidth=2),
                            yaxis = dict(title = "true positive rate",
                                         gridcolor = 'rgb(255, 255, 255)',
                                         zerolinewidth=1,
                                         ticklen=5,gridwidth=2),
                            margin = dict(b=200),
                            xaxis2=dict(domain=[0.7, 1],tickangle = 90,
                                        gridcolor = 'rgb(255, 255, 255)'),
                            yaxis2=dict(anchor='x2',gridcolor = 'rgb(255, 255, 255)')
                           )
                  )
    plt_data = [trace1,trace2,trace3]
    fig = go.Figure(data=plt_data,layout=layout)
    
    py.iplot(fig)
    
    if threshold_plot == True : 
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x,training_y)
        visualizer.poof()

    
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
data_Attrition_prediction_alg(knn,os_smote_X,test_X,
                             os_smote_Y,test_Y,threshold_plot = True)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=1, n_neighbors=5, p=2,
                     weights='uniform')

 Classification report : 
               precision    recall  f1-score   support

           0       0.99      0.67      0.80       303
           1       0.39      0.97      0.55        65

    accuracy                           0.72       368
   macro avg       0.69      0.82      0.68       368
weighted avg       0.88      0.72      0.76       368

Accuracy Score   :  0.7228260869565217
Area under curve :  0.8195988829652197
In [23]:
from sklearn.ensemble import RandomForestClassifier

#function attributes
#columns  - column used
#nf_estimators   - The number of trees in the forest.
#estimated_tree  - tree number to be displayed
#maximum_depth   - depth of the tree
#criterion_type  - split criterion type ["gini" or "entropy"]
#Model performance - prints performance of model

def plot_tree_randomforest(columns,nf_estimators,
                           estimated_tree,maximum_depth,
                           criterion_type,model_performance = None) :
    
    dataframe = df_data_og[columns + target_col].copy()
    
    #train and test datasets
    rf_x     = dataframe[[i for i in columns if i not in target_col]]
    rf_y     = dataframe[target_col]
    
    #random forest classifier
    rfc   = RandomForestClassifier(n_estimators = nf_estimators,
                                   max_depth = maximum_depth,
                                   criterion = criterion_type,
                                  )
    rfc.fit(rf_x,rf_y)
    
    estimated_tree = rfc.estimators_[estimated_tree]
    
    graph = Source(tree.export_graphviz(estimated_tree,out_file=None,
                                        rounded=True,proportion = False,
                            feature_names = columns, 
                            precision  = 2,
                            class_names=["Not churn","Churn"],
                            filled = True))
    display(graph)
    
    #model performance
    if model_performance == True :
        data_attrition_prediction(rfc,
                                 rf_x,test_X[columns],
                                 rf_y,test_Y,
                                 columns,"features",threshold_plot = True)
        

cols1 = [ i for i in train_X.columns if i not in target_col + Id_col] 
plot_tree_randomforest(cols1,100,99,3,"entropy",True)
Tree 0 YearsInCurrentRole <= 0.5 entropy = 0.64 samples = 929 value = [1229, 241] class = Not churn 1 JobInvolvement_3 <= 0.5 entropy = 0.91 samples = 158 value = [161, 76] class = Not churn 0->1 True 8 StockOptionLevel_1 <= 0.5 entropy = 0.57 samples = 771 value = [1068, 165] class = Not churn 0->8 False 2 age_group_Age_0-25 <= 0.5 entropy = 0.99 samples = 73 value = [58, 49] class = Not churn 1->2 5 BusinessTravel_Travel_Frequently <= 0.5 entropy = 0.74 samples = 85 value = [103, 27] class = Not churn 1->5 3 entropy = 0.96 samples = 57 value = [53, 32] class = Not churn 2->3 4 entropy = 0.77 samples = 16 value = [5, 17] class = Churn 2->4 6 entropy = 0.58 samples = 72 value = [93, 15] class = Not churn 5->6 7 entropy = 0.99 samples = 13 value = [10, 12] class = Churn 5->7 9 age_group_Age_25-35 <= 0.5 entropy = 0.69 samples = 438 value = [565, 126] class = Not churn 8->9 12 PercentSalaryHike <= 21.5 entropy = 0.37 samples = 333 value = [503, 39] class = Not churn 8->12 10 entropy = 0.55 samples = 243 value = [332, 49] class = Not churn 9->10 11 entropy = 0.81 samples = 195 value = [233, 77] class = Not churn 9->11 13 entropy = 0.4 samples = 301 value = [453, 39] class = Not churn 12->13 14 entropy = 0.0 samples = 32 value = [50, 0] class = Not churn 12->14
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

 Classification report : 
               precision    recall  f1-score   support

           0       0.83      1.00      0.91       303
           1       1.00      0.03      0.06        65

    accuracy                           0.83       368
   macro avg       0.91      0.52      0.48       368
weighted avg       0.86      0.83      0.76       368

Accuracy   Score :  0.8288043478260869
Area under curve :  0.5153846153846153 

In [24]:
n = np.arange(0,10).tolist()
cols1 = [ i for i in train_X.columns if i not in target_col + Id_col] 
for i in n :
    plot_tree_randomforest(cols1,10,i,3,"entropy",model_performance=False)
Tree 0 age_group_Age_0-25 <= 0.5 entropy = 0.66 samples = 906 value = [1218, 252] class = Not churn 1 StockOptionLevel_1 <= 0.5 entropy = 0.62 samples = 838 value = [1153, 212] class = Not churn 0->1 True 8 TotalWorkingYears <= 1.5 entropy = 0.96 samples = 68 value = [65, 40] class = Not churn 0->8 False 2 JobLevel_2 <= 0.5 entropy = 0.74 samples = 488 value = [620, 162] class = Not churn 1->2 5 BusinessTravel_Travel_Frequently <= 0.5 entropy = 0.42 samples = 350 value = [533, 50] class = Not churn 1->5 3 entropy = 0.84 samples = 289 value = [334, 122] class = Not churn 2->3 4 entropy = 0.54 samples = 199 value = [286, 40] class = Not churn 2->4 6 entropy = 0.34 samples = 283 value = [437, 30] class = Not churn 5->6 7 entropy = 0.66 samples = 67 value = [96, 20] class = Not churn 5->7 9 Gender <= 0.5 entropy = 0.84 samples = 25 value = [10, 27] class = Churn 8->9 12 RelationshipSatisfaction_4 <= 0.5 entropy = 0.7 samples = 43 value = [55, 13] class = Not churn 8->12 10 entropy = 1.0 samples = 13 value = [9, 9] class = Not churn 9->10 11 entropy = 0.3 samples = 12 value = [1, 18] class = Churn 9->11 13 entropy = 0.85 samples = 29 value = [31, 12] class = Not churn 12->13 14 entropy = 0.24 samples = 14 value = [24, 1] class = Not churn 12->14
Tree 0 age_group_Age_35-45 <= 0.5 entropy = 0.62 samples = 926 value = [1242, 228] class = Not churn 1 BusinessTravel_Travel_Frequently <= 0.5 entropy = 0.69 samples = 641 value = [827, 189] class = Not churn 0->1 True 8 Age <= 40.5 entropy = 0.42 samples = 285 value = [415, 39] class = Not churn 0->8 False 2 JobLevel_2 <= 0.5 entropy = 0.61 samples = 521 value = [699, 124] class = Not churn 1->2 5 OverTime <= 0.5 entropy = 0.92 samples = 120 value = [128, 65] class = Not churn 1->5 3 entropy = 0.71 samples = 347 value = [444, 108] class = Not churn 2->3 4 entropy = 0.32 samples = 174 value = [255, 16] class = Not churn 2->4 6 entropy = 0.76 samples = 80 value = [99, 28] class = Not churn 5->6 7 entropy = 0.99 samples = 40 value = [29, 37] class = Churn 5->7 9 StockOptionLevel_0 <= 0.5 entropy = 0.48 samples = 175 value = [258, 30] class = Not churn 8->9 12 WorkLifeBalance_1 <= 0.5 entropy = 0.3 samples = 110 value = [157, 9] class = Not churn 8->12 10 entropy = 0.29 samples = 108 value = [170, 9] class = Not churn 9->10 11 entropy = 0.71 samples = 67 value = [88, 21] class = Not churn 9->11 13 entropy = 0.26 samples = 105 value = [151, 7] class = Not churn 12->13 14 entropy = 0.81 samples = 5 value = [6, 2] class = Not churn 12->14
Tree 0 OverTime <= 0.5 entropy = 0.62 samples = 938 value = [1244, 226] class = Not churn 1 YearsAtCompany <= 1.5 entropy = 0.45 samples = 663 value = [943, 99] class = Not churn 0->1 True 8 YearsAtCompany <= 10.5 entropy = 0.88 samples = 275 value = [301, 127] class = Not churn 0->8 False 2 NumCompaniesWorked <= 2.5 entropy = 0.88 samples = 84 value = [86, 37] class = Not churn 1->2 5 TotalWorkingYears <= 18.5 entropy = 0.36 samples = 579 value = [857, 62] class = Not churn 1->5 3 entropy = 0.98 samples = 46 value = [41, 29] class = Not churn 2->3 4 entropy = 0.61 samples = 38 value = [45, 8] class = Not churn 2->4 6 entropy = 0.39 samples = 474 value = [686, 57] class = Not churn 5->6 7 entropy = 0.19 samples = 105 value = [171, 5] class = Not churn 5->7 9 MonthlyIncome <= 2475.0 entropy = 0.93 samples = 230 value = [236, 122] class = Not churn 8->9 12 TrainingTimesLastYear_3 <= 0.5 entropy = 0.37 samples = 45 value = [65, 5] class = Not churn 8->12 10 entropy = 0.91 samples = 43 value = [21, 43] class = Churn 9->10 11 entropy = 0.84 samples = 187 value = [215, 79] class = Not churn 9->11 13 entropy = 0.54 samples = 28 value = [35, 5] class = Not churn 12->13 14 entropy = 0.0 samples = 17 value = [30, 0] class = Not churn 12->14
Tree 0 StockOptionLevel_0 <= 0.5 entropy = 0.67 samples = 942 value = [1210, 260] class = Not churn 1 BusinessTravel_Travel_Frequently <= 0.5 entropy = 0.46 samples = 552 value = [762, 81] class = Not churn 0->1 True 8 JobInvolvement_1 <= 0.5 entropy = 0.86 samples = 390 value = [448, 179] class = Not churn 0->8 False 2 RelationshipSatisfaction_1 <= 0.5 entropy = 0.37 samples = 442 value = [628, 48] class = Not churn 1->2 5 PerformanceRating <= 0.5 entropy = 0.72 samples = 110 value = [134, 33] class = Not churn 1->5 3 entropy = 0.31 samples = 360 value = [529, 31] class = Not churn 2->3 4 entropy = 0.6 samples = 82 value = [99, 17] class = Not churn 2->4 6 entropy = 0.8 samples = 92 value = [102, 33] class = Not churn 5->6 7 entropy = 0.0 samples = 18 value = [32, 0] class = Not churn 5->7 9 Age <= 33.5 entropy = 0.82 samples = 367 value = [436, 148] class = Not churn 8->9 12 age_group_Age_0-25 <= 0.5 entropy = 0.85 samples = 23 value = [12, 31] class = Churn 8->12 10 entropy = 0.97 samples = 158 value = [156, 103] class = Not churn 9->10 11 entropy = 0.58 samples = 209 value = [280, 45] class = Not churn 9->11 13 entropy = 0.89 samples = 21 value = [12, 27] class = Churn 12->13 14 entropy = 0.0 samples = 2 value = [0, 4] class = Churn 12->14
Tree 0 OverTime <= 0.5 entropy = 0.65 samples = 935 value = [1226, 244] class = Not churn 1 YearsSinceLastPromotion <= 0.5 entropy = 0.49 samples = 674 value = [946, 113] class = Not churn 0->1 True 8 DistanceFromHome <= 11.5 entropy = 0.9 samples = 261 value = [280, 131] class = Not churn 0->8 False 2 RelationshipSatisfaction_1 <= 0.5 entropy = 0.62 samples = 255 value = [327, 60] class = Not churn 1->2 5 StockOptionLevel_2 <= 0.5 entropy = 0.4 samples = 419 value = [619, 53] class = Not churn 1->5 3 entropy = 0.55 samples = 203 value = [269, 39] class = Not churn 2->3 4 entropy = 0.84 samples = 52 value = [58, 21] class = Not churn 2->4 6 entropy = 0.43 samples = 374 value = [545, 52] class = Not churn 5->6 7 entropy = 0.1 samples = 45 value = [74, 1] class = Not churn 5->7 9 TotalWorkingYears <= 5.5 entropy = 0.8 samples = 174 value = [204, 65] class = Not churn 8->9 12 MaritalStatus_Single <= 0.5 entropy = 1.0 samples = 87 value = [76, 66] class = Not churn 8->12 10 entropy = 0.98 samples = 36 value = [23, 31] class = Churn 9->10 11 entropy = 0.63 samples = 138 value = [181, 34] class = Not churn 9->11 13 entropy = 0.87 samples = 63 value = [65, 27] class = Not churn 12->13 14 entropy = 0.76 samples = 24 value = [11, 39] class = Churn 12->14
Tree 0 OverTime <= 0.5 entropy = 0.6 samples = 943 value = [1254, 216] class = Not churn 1 MaritalStatus_Single <= 0.5 entropy = 0.46 samples = 680 value = [952, 101] class = Not churn 0->1 True 8 age_group_Age_25-35 <= 0.5 entropy = 0.85 samples = 263 value = [302, 115] class = Not churn 0->8 False 2 EducationField_Human Resources <= 0.5 entropy = 0.35 samples = 464 value = [685, 49] class = Not churn 1->2 5 Age <= 32.5 entropy = 0.64 samples = 216 value = [267, 52] class = Not churn 1->5 3 entropy = 0.33 samples = 452 value = [665, 43] class = Not churn 2->3 4 entropy = 0.78 samples = 12 value = [20, 6] class = Not churn 2->4 6 entropy = 0.86 samples = 87 value = [95, 37] class = Not churn 5->6 7 entropy = 0.4 samples = 129 value = [172, 15] class = Not churn 5->7 9 Age <= 30.5 entropy = 0.8 samples = 157 value = [182, 59] class = Not churn 8->9 12 MonthlyRate <= 22801.0 entropy = 0.9 samples = 106 value = [120, 56] class = Not churn 8->12 10 entropy = 0.89 samples = 24 value = [11, 25] class = Churn 9->10 11 entropy = 0.65 samples = 133 value = [171, 34] class = Not churn 9->11 13 entropy = 0.85 samples = 90 value = [110, 42] class = Not churn 12->13 14 entropy = 0.98 samples = 16 value = [10, 14] class = Churn 12->14
Tree 0 OverTime <= 0.5 entropy = 0.65 samples = 941 value = [1226, 244] class = Not churn 1 Age <= 32.5 entropy = 0.51 samples = 671 value = [913, 118] class = Not churn 0->1 True 8 JobLevel_1 <= 0.5 entropy = 0.86 samples = 270 value = [313, 126] class = Not churn 0->8 False 2 StockOptionLevel_0 <= 0.5 entropy = 0.68 samples = 259 value = [328, 73] class = Not churn 1->2 5 JobSatisfaction_1 <= 0.5 entropy = 0.37 samples = 412 value = [585, 45] class = Not churn 1->5 3 entropy = 0.52 samples = 147 value = [199, 26] class = Not churn 2->3 4 entropy = 0.84 samples = 112 value = [129, 47] class = Not churn 2->4 6 entropy = 0.29 samples = 334 value = [485, 26] class = Not churn 5->6 7 entropy = 0.63 samples = 78 value = [100, 19] class = Not churn 5->7 9 MaritalStatus_Single <= 0.5 entropy = 0.66 samples = 173 value = [235, 49] class = Not churn 8->9 12 StockOptionLevel_0 <= 0.5 entropy = 1.0 samples = 97 value = [78, 77] class = Not churn 8->12 10 entropy = 0.34 samples = 127 value = [191, 13] class = Not churn 9->10 11 entropy = 0.99 samples = 46 value = [44, 36] class = Not churn 9->11 13 entropy = 0.93 samples = 56 value = [59, 31] class = Not churn 12->13 14 entropy = 0.87 samples = 41 value = [19, 46] class = Churn 12->14
Tree 0 YearsWithCurrManager <= 1.5 entropy = 0.65 samples = 930 value = [1226, 244] class = Not churn 1 DailyRate <= 1097.5 entropy = 0.88 samples = 219 value = [238, 100] class = Not churn 0->1 True 8 JobLevel_1 <= 0.5 entropy = 0.55 samples = 711 value = [988, 144] class = Not churn 0->8 False 2 age_group_Age_0-25 <= 0.5 entropy = 0.94 samples = 156 value = [160, 89] class = Not churn 1->2 5 RelationshipSatisfaction_1 <= 0.5 entropy = 0.54 samples = 63 value = [78, 11] class = Not churn 1->5 3 entropy = 0.89 samples = 128 value = [143, 63] class = Not churn 2->3 4 entropy = 0.97 samples = 28 value = [17, 26] class = Churn 2->4 6 entropy = 0.42 samples = 57 value = [74, 7] class = Not churn 5->6 7 entropy = 1.0 samples = 6 value = [4, 4] class = Not churn 5->7 9 JobRole_Sales Executive <= 0.5 entropy = 0.42 samples = 478 value = [689, 63] class = Not churn 8->9 12 JobRole_Research Scientist <= 0.5 entropy = 0.75 samples = 233 value = [299, 81] class = Not churn 8->12 10 entropy = 0.3 samples = 315 value = [473, 27] class = Not churn 9->10 11 entropy = 0.59 samples = 163 value = [216, 36] class = Not churn 9->11 13 entropy = 0.86 samples = 119 value = [135, 54] class = Not churn 12->13 14 entropy = 0.59 samples = 114 value = [164, 27] class = Not churn 12->14
Tree 0 StockOptionLevel_1 <= 0.5 entropy = 0.63 samples = 940 value = [1238, 232] class = Not churn 1 EnvironmentSatisfaction_1 <= 0.5 entropy = 0.71 samples = 559 value = [702, 170] class = Not churn 0->1 True 8 PercentSalaryHike <= 18.5 entropy = 0.48 samples = 381 value = [536, 62] class = Not churn 0->8 False 2 YearsInCurrentRole <= 4.5 entropy = 0.65 samples = 439 value = [564, 113] class = Not churn 1->2 5 YearsWithCurrManager <= 0.5 entropy = 0.87 samples = 120 value = [138, 57] class = Not churn 1->5 3 entropy = 0.78 samples = 275 value = [318, 97] class = Not churn 2->3 4 entropy = 0.33 samples = 164 value = [246, 16] class = Not churn 2->4 6 entropy = 0.99 samples = 26 value = [19, 23] class = Churn 5->6 7 entropy = 0.76 samples = 94 value = [119, 34] class = Not churn 5->7 9 age_group_Age_0-25 <= 0.5 entropy = 0.55 samples = 301 value = [416, 60] class = Not churn 8->9 12 MonthlyIncome <= 4227.5 entropy = 0.12 samples = 80 value = [120, 2] class = Not churn 8->12 10 entropy = 0.48 samples = 292 value = [412, 48] class = Not churn 9->10 11 entropy = 0.81 samples = 9 value = [4, 12] class = Churn 9->11 13 entropy = 0.25 samples = 31 value = [45, 2] class = Not churn 12->13 14 entropy = 0.0 samples = 49 value = [75, 0] class = Not churn 12->14
Tree 0 StockOptionLevel_0 <= 0.5 entropy = 0.64 samples = 931 value = [1231, 239] class = Not churn 1 Age <= 29.5 entropy = 0.46 samples = 527 value = [769, 83] class = Not churn 0->1 True 8 JobLevel_1 <= 0.5 entropy = 0.82 samples = 404 value = [462, 156] class = Not churn 0->8 False 2 MonthlyRate <= 13926.0 entropy = 0.8 samples = 104 value = [131, 42] class = Not churn 1->2 5 MonthlyIncome <= 2443.5 entropy = 0.33 samples = 423 value = [638, 41] class = Not churn 1->5 3 entropy = 0.58 samples = 58 value = [82, 13] class = Not churn 2->3 4 entropy = 0.95 samples = 46 value = [49, 29] class = Not churn 2->4 6 entropy = 0.66 samples = 45 value = [59, 12] class = Not churn 5->6 7 entropy = 0.28 samples = 378 value = [579, 29] class = Not churn 5->7 9 DistanceFromHome <= 22.5 entropy = 0.58 samples = 237 value = [296, 48] class = Not churn 8->9 12 YearsAtCompany <= 1.5 entropy = 0.97 samples = 167 value = [166, 108] class = Not churn 8->12 10 entropy = 0.48 samples = 213 value = [273, 32] class = Not churn 9->10 11 entropy = 0.98 samples = 24 value = [23, 16] class = Not churn 9->11 13 entropy = 0.97 samples = 50 value = [33, 49] class = Churn 12->13 14 entropy = 0.89 samples = 117 value = [133, 59] class = Not churn 12->14
In [25]:
n = np.arange(0,10).tolist()
cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist() 
for i in n :
    plot_tree_randomforest(cols,10,i,3,"gini",model_performance=False)
Tree 0 Education_3 <= 0.5 gini = 0.25 samples = 954 value = [1257, 213] class = Not churn 1 Education_5 <= 0.5 gini = 0.23 samples = 592 value = [788, 124] class = Not churn 0->1 True 8 RelationshipSatisfaction_4 <= 0.5 gini = 0.27 samples = 362 value = [469, 89] class = Not churn 0->8 False 2 RelationshipSatisfaction_3 <= 0.5 gini = 0.24 samples = 566 value = [752, 121] class = Not churn 1->2 5 RelationshipSatisfaction_4 <= 0.5 gini = 0.14 samples = 26 value = [36, 3] class = Not churn 1->5 3 gini = 0.23 samples = 392 value = [527, 81] class = Not churn 2->3 4 gini = 0.26 samples = 174 value = [225, 40] class = Not churn 2->4 6 gini = 0.2 samples = 19 value = [24, 3] class = Not churn 5->6 7 gini = 0.0 samples = 7 value = [12, 0] class = Not churn 5->7 9 RelationshipSatisfaction_1 <= 0.5 gini = 0.29 samples = 264 value = [330, 72] class = Not churn 8->9 12 EnvironmentSatisfaction_2 <= 0.5 gini = 0.19 samples = 98 value = [139, 17] class = Not churn 8->12 10 gini = 0.28 samples = 193 value = [243, 49] class = Not churn 9->10 11 gini = 0.33 samples = 71 value = [87, 23] class = Not churn 9->11 13 gini = 0.19 samples = 82 value = [115, 14] class = Not churn 12->13 14 gini = 0.2 samples = 16 value = [24, 3] class = Not churn 12->14
Tree 0 RelationshipSatisfaction_1 <= 0.5 gini = 0.29 samples = 929 value = [1213, 257] class = Not churn 1 Education_4 <= 0.5 gini = 0.27 samples = 752 value = [998, 192] class = Not churn 0->1 True 8 Education_3 <= 0.5 gini = 0.36 samples = 177 value = [215, 65] class = Not churn 0->8 False 2 Education_3 <= 0.5 gini = 0.25 samples = 537 value = [732, 124] class = Not churn 1->2 5 RelationshipSatisfaction_2 <= 0.5 gini = 0.32 samples = 215 value = [266, 68] class = Not churn 1->5 3 gini = 0.19 samples = 244 value = [344, 42] class = Not churn 2->3 4 gini = 0.29 samples = 293 value = [388, 82] class = Not churn 2->4 6 gini = 0.34 samples = 162 value = [203, 57] class = Not churn 5->6 7 gini = 0.25 samples = 53 value = [63, 11] class = Not churn 5->7 9 Education_4 <= 0.5 gini = 0.3 samples = 109 value = [134, 30] class = Not churn 8->9 12 EnvironmentSatisfaction_2 <= 0.5 gini = 0.42 samples = 68 value = [81, 35] class = Not churn 8->12 10 gini = 0.37 samples = 70 value = [77, 25] class = Not churn 9->10 11 gini = 0.15 samples = 39 value = [57, 5] class = Not churn 9->11 13 gini = 0.42 samples = 57 value = [68, 30] class = Not churn 12->13 14 gini = 0.4 samples = 11 value = [13, 5] class = Not churn 12->14
Tree 0 Education_5 <= 0.5 gini = 0.26 samples = 916 value = [1243, 227] class = Not churn 1 Education_2 <= 0.5 gini = 0.27 samples = 889 value = [1198, 224] class = Not churn 0->1 True 8 RelationshipSatisfaction_1 <= 0.5 gini = 0.12 samples = 27 value = [45, 3] class = Not churn 0->8 False 2 Education_1 <= 0.5 gini = 0.28 samples = 716 value = [957, 190] class = Not churn 1->2 5 RelationshipSatisfaction_2 <= 0.5 gini = 0.22 samples = 173 value = [241, 34] class = Not churn 1->5 3 gini = 0.27 samples = 600 value = [807, 157] class = Not churn 2->3 4 gini = 0.3 samples = 116 value = [150, 33] class = Not churn 2->4 6 gini = 0.24 samples = 147 value = [199, 32] class = Not churn 5->6 7 gini = 0.09 samples = 26 value = [42, 2] class = Not churn 5->7 9 EnvironmentSatisfaction_2 <= 0.5 gini = 0.05 samples = 22 value = [38, 1] class = Not churn 8->9 12 EnvironmentSatisfaction_2 <= 0.5 gini = 0.35 samples = 5 value = [7, 2] class = Not churn 8->12 10 gini = 0.08 samples = 13 value = [24, 1] class = Not churn 9->10 11 gini = 0.0 samples = 9 value = [14, 0] class = Not churn 9->11 13 gini = 0.5 samples = 3 value = [2, 2] class = Not churn 12->13 14 gini = 0.0 samples = 2 value = [5, 0] class = Not churn 12->14
Tree 0 RelationshipSatisfaction_3 <= 0.5 gini = 0.29 samples = 949 value = [1215, 255] class = Not churn 1 Education_1 <= 0.5 gini = 0.3 samples = 651 value = [816, 187] class = Not churn 0->1 True 8 Education_5 <= 0.5 gini = 0.25 samples = 298 value = [399, 68] class = Not churn 0->8 False 2 Education_3 <= 0.5 gini = 0.29 samples = 581 value = [741, 161] class = Not churn 1->2 5 RelationshipSatisfaction_1 <= 0.5 gini = 0.38 samples = 70 value = [75, 26] class = Not churn 1->5 3 gini = 0.27 samples = 333 value = [440, 84] class = Not churn 2->3 4 gini = 0.32 samples = 248 value = [301, 77] class = Not churn 2->4 6 gini = 0.35 samples = 46 value = [57, 17] class = Not churn 5->6 7 gini = 0.44 samples = 24 value = [18, 9] class = Not churn 5->7 9 Education_1 <= 0.5 gini = 0.25 samples = 287 value = [383, 67] class = Not churn 8->9 12 EnvironmentSatisfaction_2 <= 0.5 gini = 0.11 samples = 11 value = [16, 1] class = Not churn 8->12 10 gini = 0.27 samples = 257 value = [339, 64] class = Not churn 9->10 11 gini = 0.12 samples = 30 value = [44, 3] class = Not churn 9->11 13 gini = 0.2 samples = 7 value = [8, 1] class = Not churn 12->13 14 gini = 0.0 samples = 4 value = [8, 0] class = Not churn 12->14
Tree 0 Education_2 <= 0.5 gini = 0.28 samples = 948 value = [1226, 244] class = Not churn 1 Education_5 <= 0.5 gini = 0.29 samples = 772 value = [976, 206] class = Not churn 0->1 True 8 RelationshipSatisfaction_1 <= 0.5 gini = 0.23 samples = 176 value = [250, 38] class = Not churn 0->8 False 2 Education_4 <= 0.5 gini = 0.29 samples = 736 value = [925, 202] class = Not churn 1->2 5 RelationshipSatisfaction_2 <= 0.5 gini = 0.13 samples = 36 value = [51, 4] class = Not churn 1->5 3 gini = 0.3 samples = 478 value = [593, 136] class = Not churn 2->3 4 gini = 0.28 samples = 258 value = [332, 66] class = Not churn 2->4 6 gini = 0.09 samples = 27 value = [41, 2] class = Not churn 5->6 7 gini = 0.28 samples = 9 value = [10, 2] class = Not churn 5->7 9 RelationshipSatisfaction_4 <= 0.5 gini = 0.2 samples = 144 value = [203, 25] class = Not churn 8->9 12 EnvironmentSatisfaction_2 <= 0.5 gini = 0.34 samples = 32 value = [47, 13] class = Not churn 8->12 10 gini = 0.16 samples = 80 value = [114, 11] class = Not churn 9->10 11 gini = 0.23 samples = 64 value = [89, 14] class = Not churn 9->11 13 gini = 0.34 samples = 28 value = [43, 12] class = Not churn 12->13 14 gini = 0.32 samples = 4 value = [4, 1] class = Not churn 12->14
Tree 0 Education_5 <= 0.5 gini = 0.28 samples = 937 value = [1222, 248] class = Not churn 1 Education_4 <= 0.5 gini = 0.28 samples = 905 value = [1175, 244] class = Not churn 0->1 True 8 RelationshipSatisfaction_4 <= 0.5 gini = 0.14 samples = 32 value = [47, 4] class = Not churn 0->8 False 2 RelationshipSatisfaction_4 <= 0.5 gini = 0.3 samples = 656 value = [852, 189] class = Not churn 1->2 5 RelationshipSatisfaction_4 <= 0.5 gini = 0.25 samples = 249 value = [323, 55] class = Not churn 1->5 3 gini = 0.32 samples = 469 value = [597, 146] class = Not churn 2->3 4 gini = 0.25 samples = 187 value = [255, 43] class = Not churn 2->4 6 gini = 0.24 samples = 171 value = [226, 37] class = Not churn 5->6 7 gini = 0.26 samples = 78 value = [97, 18] class = Not churn 5->7 9 RelationshipSatisfaction_1 <= 0.5 gini = 0.21 samples = 22 value = [30, 4] class = Not churn 8->9 12 gini = 0.0 samples = 10 value = [17, 0] class = Not churn 8->12 10 gini = 0.24 samples = 18 value = [25, 4] class = Not churn 9->10 11 gini = 0.0 samples = 4 value = [5, 0] class = Not churn 9->11
Tree 0 Education_5 <= 0.5 gini = 0.28 samples = 907 value = [1224, 246] class = Not churn 1 Education_1 <= 0.5 gini = 0.28 samples = 883 value = [1190, 244] class = Not churn 0->1 True 8 RelationshipSatisfaction_4 <= 0.5 gini = 0.1 samples = 24 value = [34, 2] class = Not churn 0->8 False 2 Education_2 <= 0.5 gini = 0.27 samples = 783 value = [1065, 209] class = Not churn 1->2 5 RelationshipSatisfaction_3 <= 0.5 gini = 0.34 samples = 100 value = [125, 35] class = Not churn 1->5 3 gini = 0.29 samples = 611 value = [819, 171] class = Not churn 2->3 4 gini = 0.23 samples = 172 value = [246, 38] class = Not churn 2->4 6 gini = 0.38 samples = 71 value = [88, 30] class = Not churn 5->6 7 gini = 0.21 samples = 29 value = [37, 5] class = Not churn 5->7 9 RelationshipSatisfaction_1 <= 0.5 gini = 0.12 samples = 19 value = [28, 2] class = Not churn 8->9 12 gini = 0.0 samples = 5 value = [6, 0] class = Not churn 8->12 10 gini = 0.15 samples = 15 value = [23, 2] class = Not churn 9->10 11 gini = 0.0 samples = 4 value = [5, 0] class = Not churn 9->11
Tree 0 RelationshipSatisfaction_3 <= 0.5 gini = 0.26 samples = 905 value = [1246, 224] class = Not churn 1 Education_5 <= 0.5 gini = 0.27 samples = 621 value = [838, 165] class = Not churn 0->1 True 6 Education_1 <= 0.5 gini = 0.22 samples = 284 value = [408, 59] class = Not churn 0->6 False 2 RelationshipSatisfaction_2 <= 0.5 gini = 0.28 samples = 604 value = [813, 165] class = Not churn 1->2 5 gini = 0.0 samples = 17 value = [25, 0] class = Not churn 1->5 3 gini = 0.27 samples = 435 value = [594, 116] class = Not churn 2->3 4 gini = 0.3 samples = 169 value = [219, 49] class = Not churn 2->4 7 Education_4 <= 0.5 gini = 0.21 samples = 251 value = [369, 49] class = Not churn 6->7 10 EnvironmentSatisfaction_2 <= 0.5 gini = 0.32 samples = 33 value = [39, 10] class = Not churn 6->10 8 gini = 0.19 samples = 178 value = [275, 32] class = Not churn 7->8 9 gini = 0.26 samples = 73 value = [94, 17] class = Not churn 7->9 11 gini = 0.32 samples = 28 value = [32, 8] class = Not churn 10->11 12 gini = 0.35 samples = 5 value = [7, 2] class = Not churn 10->12
Tree 0 RelationshipSatisfaction_3 <= 0.5 gini = 0.3 samples = 924 value = [1201, 269] class = Not churn 1 RelationshipSatisfaction_2 <= 0.5 gini = 0.31 samples = 639 value = [808, 194] class = Not churn 0->1 True 8 Education_4 <= 0.5 gini = 0.27 samples = 285 value = [393, 75] class = Not churn 0->8 False 2 Education_4 <= 0.5 gini = 0.32 samples = 445 value = [548, 138] class = Not churn 1->2 5 Education_4 <= 0.5 gini = 0.29 samples = 194 value = [260, 56] class = Not churn 1->5 3 gini = 0.33 samples = 332 value = [401, 107] class = Not churn 2->3 4 gini = 0.29 samples = 113 value = [147, 31] class = Not churn 2->4 6 gini = 0.28 samples = 135 value = [187, 37] class = Not churn 5->6 7 gini = 0.33 samples = 59 value = [73, 19] class = Not churn 5->7 9 Education_2 <= 0.5 gini = 0.23 samples = 207 value = [293, 44] class = Not churn 8->9 12 EnvironmentSatisfaction_2 <= 0.5 gini = 0.36 samples = 78 value = [100, 31] class = Not churn 8->12 10 gini = 0.26 samples = 154 value = [205, 38] class = Not churn 9->10 11 gini = 0.12 samples = 53 value = [88, 6] class = Not churn 9->11 13 gini = 0.32 samples = 61 value = [75, 19] class = Not churn 12->13 14 gini = 0.44 samples = 17 value = [25, 12] class = Not churn 12->14
Tree 0 RelationshipSatisfaction_4 <= 0.5 gini = 0.28 samples = 922 value = [1222, 248] class = Not churn 1 RelationshipSatisfaction_3 <= 0.5 gini = 0.3 samples = 665 value = [862, 196] class = Not churn 0->1 True 8 Education_3 <= 0.5 gini = 0.22 samples = 257 value = [360, 52] class = Not churn 0->8 False 2 Education_4 <= 0.5 gini = 0.32 samples = 363 value = [466, 115] class = Not churn 1->2 5 Education_1 <= 0.5 gini = 0.28 samples = 302 value = [396, 81] class = Not churn 1->5 3 gini = 0.35 samples = 262 value = [322, 92] class = Not churn 2->3 4 gini = 0.24 samples = 101 value = [144, 23] class = Not churn 2->4 6 gini = 0.27 samples = 264 value = [348, 68] class = Not churn 5->6 7 gini = 0.34 samples = 38 value = [48, 13] class = Not churn 5->7 9 Education_2 <= 0.5 gini = 0.19 samples = 163 value = [230, 27] class = Not churn 8->9 12 EnvironmentSatisfaction_2 <= 0.5 gini = 0.27 samples = 94 value = [130, 25] class = Not churn 8->12 10 gini = 0.22 samples = 115 value = [156, 22] class = Not churn 9->10 11 gini = 0.12 samples = 48 value = [74, 5] class = Not churn 9->11 13 gini = 0.28 samples = 80 value = [111, 22] class = Not churn 12->13 14 gini = 0.24 samples = 14 value = [19, 3] class = Not churn 12->14
In [26]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB(priors=None)

data_Attrition_prediction_alg(gnb,os_smote_X,test_X,os_smote_Y,test_Y)
GaussianNB(priors=None, var_smoothing=1e-09)

 Classification report : 
               precision    recall  f1-score   support

           0       0.87      0.73      0.79       303
           1       0.27      0.48      0.35        65

    accuracy                           0.68       368
   macro avg       0.57      0.60      0.57       368
weighted avg       0.76      0.68      0.71       368

Accuracy Score   :  0.6820652173913043
Area under curve :  0.6014978420919016
In [27]:
from sklearn.svm import SVC

#Support vector classifier
#using linear hyper plane
svc_lin  = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
               decision_function_shape='ovr', degree=3, gamma=1.0, kernel='linear',
               max_iter=-1, probability=True, random_state=None, shrinking=True,
               tol=0.001, verbose=False)

cols = [i for i in data.columns if i not in Id_col + target_col]
data_attrition_prediction(svc_lin,os_smote_X,test_X,os_smote_Y,test_Y,
                         cols,"coefficients",threshold_plot = False)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1.0, kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

 Classification report : 
               precision    recall  f1-score   support

           0       0.89      0.97      0.93       303
           1       0.78      0.43      0.55        65

    accuracy                           0.88       368
   macro avg       0.83      0.70      0.74       368
weighted avg       0.87      0.88      0.86       368

Accuracy   Score :  0.8777173913043478
Area under curve :  0.7021832952526023 

In [29]:
#tuning parameters
#Support vector classifier
#using non-linear hyper plane("rbf")

svc_rbf  = SVC(C=1.0, kernel='rbf', 
               degree= 3, gamma=1.0, 
               coef0=0.0, shrinking=True,
               probability=True,tol=0.001,
               cache_size=200, class_weight=None,
               verbose=False,max_iter= -1,
               random_state=None)

data_Attrition_prediction_alg(svc_rbf,os_smote_X,test_X,os_smote_Y,test_Y,threshold_plot = False)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1.0, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

 Classification report : 
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       303
           1       1.00      0.88      0.93        65

    accuracy                           0.98       368
   macro avg       0.99      0.94      0.96       368
weighted avg       0.98      0.98      0.98       368

Accuracy Score   :  0.9782608695652174
Area under curve :  0.9384615384615385
In [32]:
from lightgbm import LGBMClassifier

lgbm_c = LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                        learning_rate=0.5, max_depth=7, min_child_samples=20,
                        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
                        n_jobs=-1, num_leaves=500, objective='binary', random_state=None,
                        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
                        subsample_for_bin=200000, subsample_freq=0)

cols = [i for i in data.columns if i not in Id_col + target_col]
data_attrition_prediction(lgbm_c,os_smote_X,test_X,os_smote_Y,test_Y,
                         cols,"features",threshold_plot = True)
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.5, max_depth=7,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=500, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

 Classification report : 
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       303
           1       0.92      0.94      0.93        65

    accuracy                           0.98       368
   macro avg       0.96      0.96      0.96       368
weighted avg       0.98      0.98      0.98       368

Accuracy   Score :  0.9755434782608695
Area under curve :  0.960979944148261 

In [36]:
from xgboost import XGBClassifier

xgc = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                    colsample_bytree=1, gamma=0, learning_rate=0.9, max_delta_step=0,
                    max_depth = 7, min_child_weight=1, missing=None, n_estimators=100,
                    n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
                    reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
                    silent=True, subsample=1)


data_attrition_prediction(xgc,os_smote_X,test_X,os_smote_Y,test_Y,
                         cols,"features",threshold_plot = True)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.9, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=1, nthread=1, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
              subsample=1, tree_method=None, validate_parameters=False,
              verbosity=None)

 Classification report : 
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       303
           1       0.90      0.94      0.92        65

    accuracy                           0.97       368
   macro avg       0.94      0.96      0.95       368
weighted avg       0.97      0.97      0.97       368

Accuracy   Score :  0.970108695652174
Area under curve :  0.9576796141152577 

In [38]:
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score

#gives model report in dataframe
def model_report(model,training_x,testing_x,training_y,testing_y,name) :
    model.fit(training_x,training_y)
    predictions  = model.predict(testing_x)
    accuracy     = accuracy_score(testing_y,predictions)
    recallscore  = recall_score(testing_y,predictions)
    precision    = precision_score(testing_y,predictions)
    roc_auc      = roc_auc_score(testing_y,predictions)
    f1score      = f1_score(testing_y,predictions) 
    kappa_metric = cohen_kappa_score(testing_y,predictions)
    
    df = pd.DataFrame({"Model"           : [name],
                       "Accuracy_score"  : [accuracy],
                       "Recall_score"    : [recallscore],
                       "Precision"       : [precision],
                       "f1_score"        : [f1score],
                       "Area_under_curve": [roc_auc],
                       "Kappa_metric"    : [kappa_metric],
                      })
    return df

#outputs for every model
model1 = model_report(logit,train_X,test_X,train_Y,test_Y,
                      "Logistic Regression(Baseline_model)")
model2 = model_report(logit_smote,os_smote_X,test_X,os_smote_Y,test_Y,
                      "Logistic Regression(SMOTE)")
model3 = model_report(logit_rfe,train_rf_X,test_rf_X,train_rf_Y,test_rf_Y,
                      "Logistic Regression(RFE)")
decision_tree = DecisionTreeClassifier(max_depth = 9,
                                       random_state = 123,
                                       splitter  = "best",
                                       criterion = "gini",
                                      )
model4 = model_report(decision_tree,train_X,test_X,train_Y,test_Y,
                      "Decision Tree")
model5 = model_report(knn,os_smote_X,test_X,os_smote_Y,test_Y,
                      "KNN Classifier")
rfc = RandomForestClassifier(n_estimators = 1000,
                             random_state = 123,
                             max_depth = 9,
                             criterion = "gini")
model6 = model_report(rfc,train_X,test_X,train_Y,test_Y,
                      "Random Forest Classifier")
model7 = model_report(gnb,os_smote_X,test_X,os_smote_Y,test_Y,
                      "Naive Bayes")
model8 = model_report(svc_lin,os_smote_X,test_X,os_smote_Y,test_Y,
                      "SVM Classifier Linear")
model9 = model_report(svc_rbf,os_smote_X,test_X,os_smote_Y,test_Y,
                      "SVM Classifier RBF")
model10 = model_report(lgbm_c,os_smote_X,test_X,os_smote_Y,test_Y,
                      "LGBM Classifier")
model11 = model_report(xgc,os_smote_X,test_X,os_smote_Y,test_Y,
                      "XGBoost Classifier")

#concat all models
model_performances = pd.concat([model1,model2,model3,
                                model4,model5,model6,
                                model7,model8,model9,
                                model10,model11],axis = 0).reset_index()

model_performances = model_performances.drop(columns = "index",axis =1)

table  = ff.create_table(np.round(model_performances,4))

py.iplot(table)
In [39]:
model_performances
def output_tracer(metric,color) :
    tracer = go.Bar(y = model_performances["Model"] ,
                    x = model_performances[metric],
                    orientation = "h",name = metric ,
                    marker = dict(line = dict(width =.7),
                                  color = color)
                   )
    return tracer

layout = go.Layout(dict(title = "Model performances",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "metric",
                                     zerolinewidth=1,
                                     ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        margin = dict(l = 250),
                        height = 780
                       )
                  )


trace1  = output_tracer("Accuracy_score","#6699FF")
trace2  = output_tracer('Recall_score',"red")
trace3  = output_tracer('Precision',"#33CC99")
trace4  = output_tracer('f1_score',"lightgrey")
trace5  = output_tracer('Kappa_metric',"#FFCC99")

data = [trace1,trace2,trace3,trace4,trace5]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)
In [40]:
lst    = [logit,logit_smote,decision_tree,knn,rfc,
          gnb,svc_lin,svc_rbf,lgbm_c,xgc]

length = len(lst)

mods   = ['Logistic Regression(Baseline_model)','Logistic Regression(SMOTE)',
          'Decision Tree','KNN Classifier','Random Forest Classifier',"Naive Bayes",
          'SVM Classifier Linear','SVM Classifier RBF', 'LGBM Classifier',
          'XGBoost Classifier']

fig = plt.figure(figsize=(13,15))
fig.set_facecolor("#F3F3F3")
for i,j,k in itertools.zip_longest(lst,range(length),mods) :
    plt.subplot(4,3,j+1)
    predictions = i.predict(test_X)
    conf_matrix = confusion_matrix(predictions,test_Y)
    sns.heatmap(conf_matrix,annot=True,fmt = "d",square = True,
                xticklabels=["not churn","churn"],
                yticklabels=["not churn","churn"],
                linewidths = 2,linecolor = "w",cmap = "Set1")
    plt.title(k,color = "b")
    plt.subplots_adjust(wspace = .3,hspace = .3)
In [41]:
lst    = [logit,logit_smote,decision_tree,knn,rfc,
          gnb,svc_lin,svc_rbf,lgbm_c,xgc]

length = len(lst)

mods   = ['Logistic Regression(Baseline_model)','Logistic Regression(SMOTE)',
          'Decision Tree','KNN Classifier','Random Forest Classifier',"Naive Bayes",
          'SVM Classifier Linear','SVM Classifier RBF', 'LGBM Classifier',
          'XGBoost Classifier']

plt.style.use("dark_background")
fig = plt.figure(figsize=(12,16))
fig.set_facecolor("#F3F3F3")
for i,j,k in itertools.zip_longest(lst,range(length),mods) :
    qx = plt.subplot(4,3,j+1)
    probabilities = i.predict_proba(test_X)
    predictions   = i.predict(test_X)
    fpr,tpr,thresholds = roc_curve(test_Y,probabilities[:,1])
    plt.plot(fpr,tpr,linestyle = "dotted",
             color = "royalblue",linewidth = 2,
             label = "AUC = " + str(np.around(roc_auc_score(test_Y,predictions),3)))
    plt.plot([0,1],[0,1],linestyle = "dashed",
             color = "orangered",linewidth = 1.5)
    plt.fill_between(fpr,tpr,alpha = .4)
    plt.fill_between([0,1],[0,1],color = "k")
    plt.legend(loc = "lower right",
               prop = {"size" : 12})
    qx.set_facecolor("k")
    plt.grid(True,alpha = .15)
    plt.title(k,color = "b")
    plt.xticks(np.arange(0,1,.3))
    plt.yticks(np.arange(0,1,.3))
In [42]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score


lst    = [logit,logit_smote,decision_tree,knn,rfc,
          gnb,svc_lin,svc_rbf,lgbm_c,xgc]

length = len(lst)

mods   = ['Logistic Regression(Baseline_model)','Logistic Regression(SMOTE)',
          'Decision Tree','KNN Classifier','Random Forest Classifier',"Naive Bayes",
          'SVM Classifier Linear','SVM Classifier RBF', 'LGBM Classifier',
          'XGBoost Classifier']

fig = plt.figure(figsize=(13,17))
fig.set_facecolor("#F3F3F3")
for i,j,k in itertools.zip_longest(lst,range(length),mods) :
    
    qx = plt.subplot(4,3,j+1)
    probabilities = i.predict_proba(test_X)
    predictions   = i.predict(test_X)
    recall,precision,thresholds = precision_recall_curve(test_Y,probabilities[:,1])
    plt.plot(recall,precision,linewidth = 1.5,
             label = ("avg_pcn : " + 
                      str(np.around(average_precision_score(test_Y,predictions),3))))
    plt.plot([0,1],[0,0],linestyle = "dashed")
    plt.fill_between(recall,precision,alpha = .2)
    plt.legend(loc = "lower left",
               prop = {"size" : 10})
    qx.set_facecolor("k")
    plt.grid(True,alpha = .15)
    plt.title(k,color = "b")
    plt.xlabel("recall",fontsize =7)
    plt.ylabel("precision",fontsize =7)
    plt.xlim([0.25,1])
    plt.yticks(np.arange(0,1,.3))
In [ ]: